PM 566 Lab 6

Author

Dana Gonzalez

Setup Packages

library(tidytext)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)

Read in Medical Transcriptions

library(readr)
library(dplyr)
# Download the medical transcription samples CSV and parse it with readr.
mt_samples_url <- "https://raw.githubusercontent.com/USCbiostats/data-science-data/master/00_mtsamples/mtsamples.csv"
mt_samples <- read_csv(file = mt_samples_url)
New names:
Rows: 4999 Columns: 6
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(5): description, medical_specialty, sample_name, transcription, keywords dbl
(1): ...1
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
# Keep only the description, specialty, and transcription columns (this drops
# the unnamed index column, sample_name, and keywords), then preview the data.
mt_samples <- select(
  mt_samples,
  description,
  medical_specialty,
  transcription
)

head(mt_samples)
# A tibble: 6 × 3
  description                                    medical_specialty transcription
  <chr>                                          <chr>             <chr>        
1 A 23-year-old white female presents with comp… Allergy / Immuno… "SUBJECTIVE:…
2 Consult for laparoscopic gastric bypass.       Bariatrics        "PAST MEDICA…
3 Consult for laparoscopic gastric bypass.       Bariatrics        "HISTORY OF …
4 2-D M-Mode. Doppler.                           Cardiovascular /… "2-D M-MODE:…
5 2-D Echocardiogram                             Cardiovascular /… "1.  The lef…
6 Morbid obesity.  Laparoscopic antecolic anteg… Bariatrics        "PREOPERATIV…

Question 1

Count of Specialty Types

# Tally the number of transcriptions in each medical specialty,
# listing the largest groups first.
count(mt_samples, medical_specialty, sort = TRUE)
# A tibble: 40 × 2
   medical_specialty                 n
   <chr>                         <int>
 1 Surgery                        1103
 2 Consult - History and Phy.      516
 3 Cardiovascular / Pulmonary      372
 4 Orthopedic                      355
 5 Radiology                       273
 6 General Medicine                259
 7 Gastroenterology                230
 8 Neurology                       223
 9 SOAP / Chart / Progress Notes   166
10 Obstetrics / Gynecology         160
# ℹ 30 more rows

Running this code, we get 40 specialty types. However, we must note that some of the “specialty types” in this count are items such as “SOAP / Chart / Progress Notes” and “Discharge Summary”, which may not be actual specialty types and thus distort the count. Excluding these kinds of items, we get approximately 34 specialty types.

Barchart of Observations per Specialty Type

# Bar chart of the number of transcriptions per specialty. The x-axis labels
# are rotated 90 degrees so the long specialty names do not overlap.
ggplot(data = mt_samples, mapping = aes(x = medical_specialty)) +
  geom_bar(fill = "goldenrod") +
  labs(
    title = "Counts of Observations per Specialty Type",
    x = "Specialty Type",
    y = "Count"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Looking at this barchart, we can clearly see that the medical specialties with the most observations in the dataset are Surgery, Consult - History and Phy., and Cardiovascular / Pulmonary. As mentioned above, some of these “specialty types”, such as Consult - History and Phy., may not be actual specialty types but rather a type of visit or note.

Question 2

Tokenize Words in “Transcription” Column + Token Counts

# Split every transcription into single-word tokens (one row per token) and
# tally how often each word occurs across the corpus, most frequent first.
tokenized <- count(
  unnest_tokens(mt_samples, word, transcription),
  word,
  sort = TRUE
)

Visualize the Top 20 Most Frequent Words

# Word frequencies over all transcriptions, reduced to the 20 highest counts.
tokenized_counts <- slice_max(
  count(
    unnest_tokens(mt_samples, word, transcription),
    word,
    sort = TRUE
  ),
  order_by = n,
  n = 20
)

# Horizontal bar chart: one bar per word, bar length = number of occurrences.
ggplot(data = tokenized_counts, mapping = aes(x = n, y = word)) +
  geom_col(fill = "goldenrod") +
  theme_minimal() +
  labs(
    title = "Top 20 Most Frequent Words from 'Transcription' Column",
    x = "Count",
    y = "Word"
  )

Looking at this barchart, the top 20 most frequent words in the “transcription” column are words that are common across all contexts (i.e., “stopwords” like “the”, “and”, and “was”). Because of this, we don’t get much insight into the data from this count alone.

Question 3

Visualization without Stopwords and Numbers

# Tokenise the transcriptions again, this time discarding every token that
# appears in the `stop_words` lexicon or that contains a digit.
content_words <- mt_samples |>
  unnest_tokens(word, transcription) |>
  anti_join(stop_words, by = "word") |>
  filter(!grepl("[0-9]", word))

# Frequencies of the remaining words, reduced to the 20 highest counts.
tokenized_counts <- content_words |>
  count(word, sort = TRUE) |>
  slice_max(order_by = n, n = 20)

# Horizontal bar chart: one bar per word, bar length = number of occurrences.
ggplot(data = tokenized_counts, mapping = aes(x = n, y = word)) +
  geom_col(fill = "goldenrod") +
  theme_minimal() +
  labs(
    title = "Top 20 Most Frequent Words from 'Transcription' Column (Without Stopwords)",
    x = "Count",
    y = "Word"
  )

Yes, removing stopwords and numbers gives us a much better idea of what this dataset is about. Looking at this barchart, we can clearly see that this dataset is medical/clinical in nature, with words like “patient”, “procedure”, “anesthesia”, etc. Also, with words like “anesthesia”, “incision”, and “removed”, we are reminded that “Surgery” was the most frequent specialty type according to our analysis above.

Question 4

Tokenize into Bi-Grams + Token Counts

# Split each transcription into overlapping two-word sequences (bi-grams)
# and tally them, most frequent first.
count(
  unnest_ngrams(mt_samples, ngram, transcription, n = 2),
  ngram,
  sort = TRUE
)
# A tibble: 301,419 × 2
   ngram           n
   <chr>       <int>
 1 the patient 20307
 2 of the      19062
 3 in the      12790
 4 to the      12374
 5 was then     6956
 6 and the      6350
 7 patient was  6293
 8 the right    5509
 9 on the       5241
10 the left     4860
# ℹ 301,409 more rows

Visualize the Top 20 Most Frequent Bi-Grams

# Bi-gram frequencies over all transcriptions, reduced to the 20 highest counts.
tokenized_bi_counts <- slice_max(
  count(
    unnest_ngrams(mt_samples, ngram, transcription, n = 2),
    ngram,
    sort = TRUE
  ),
  order_by = n,
  n = 20
)

# Horizontal bar chart: one bar per bi-gram, bar length = occurrences.
ggplot(data = tokenized_bi_counts, mapping = aes(x = n, y = ngram)) +
  geom_col(fill = "goldenrod") +
  theme_minimal() +
  labs(
    title = "Top 20 Most Frequent Bi-Grams from 'Transcription' Column",
    x = "Count",
    y = "Bi-Gram"
  )

Tokenize into Tri-Grams + Token Counts

# Split each transcription into overlapping three-word sequences (tri-grams)
# and tally them, most frequent first.
count(
  unnest_ngrams(mt_samples, ngram, transcription, n = 3),
  ngram,
  sort = TRUE
)
# A tibble: 655,442 × 2
   ngram                  n
   <chr>              <int>
 1 the patient was     6104
 2 the patient is      3075
 3 as well as          2243
 4 there is no         1678
 5 the operating room  1532
 6 patient is a        1491
 7 prepped and draped  1490
 8 was used to         1480
 9 and draped in       1372
10 at this time        1333
# ℹ 655,432 more rows

Visualize the Top 20 Most Frequent Tri-Grams

# Tri-gram frequencies over all transcriptions, reduced to the 20 highest
# counts.
tokenized_tri_counts <- mt_samples |>
  unnest_ngrams(ngram, transcription, n = 3) |>
  count(ngram, sort = TRUE) |>
  slice_max(n, n = 20)

# Horizontal bar chart: one bar per tri-gram, bar length = occurrences.
# The y-axis is labelled "Tri-Gram" (it previously read "Bi-Gram", a leftover
# from the bi-gram chart this code was copied from).
tokenized_tri_counts |>
  ggplot(aes(n, ngram)) +
  geom_col(fill = "goldenrod") +
  labs(title = "Top 20 Most Frequent Tri-Grams from 'Transcription' Column",
       x = "Count",
       y = "Tri-Gram") +
  theme_minimal()

By comparing the barcharts for bi-grams and tri-grams, we see that the counts for the most common bi-grams are much greater than those for the most common tri-grams. This makes sense, as bi-grams (two words) are less specific and may thus occur more frequently in the dataset than tri-grams (three words). Also, the bi-gram “distribution” could be considered multimodal (most common bi-grams: “the patient”, “of the”, “in the”, and “to the”). In contrast, the “distribution” for the tri-grams is very clearly unimodal, with the most common tri-gram being “the patient was”.

Question 5

# Word whose immediate neighbours are tabulated below.
chosen_word <- "history"

# Tokenise each transcription and attach to every token the token directly
# after it (`next_word`) and directly before it (`previous_word`).
# lead()/lag() are applied within each transcription (identified by its row
# number) so that the last word of one note is never paired with the first
# word of the next note.
word_neighbors <- mt_samples |>
  mutate(doc_id = row_number()) |>
  unnest_tokens(word, transcription) |>
  group_by(doc_id) |>
  mutate(next_word = lead(word), previous_word = lag(word)) |>
  ungroup()

# A token is "before" the chosen word when its next_word is the chosen word,
# and "after" it when its previous_word is the chosen word. The token itself
# (`word`) is what gets counted: counting previous_word/next_word of these
# rows would tabulate words two positions away from the chosen word instead
# of its direct neighbours. Rows whose neighbour is NA (first/last token of a
# transcription) are dropped by filter().
before_after_counts <- bind_rows(
  word_neighbors |>
    filter(next_word == chosen_word) |>
    transmute(word, position = "before"),
  word_neighbors |>
    filter(previous_word == chosen_word) |>
    transmute(word, position = "after")
) |>
  count(word, position, sort = TRUE)

# NOTE(review): any previously rendered table for this chunk was produced by
# the two-positions-away logic and needs to be regenerated.
before_after_counts
# A tibble: 2,112 × 3
   word         position     n
   <chr>        <chr>    <int>
 1 past         before    1572
 2 present      after     1060
 3 patient      after      717
 4 has          before     396
 5 for          after      393
 6 with         before     390
 7 is           after      352
 8 and          before     279
 9 a            before     270
10 hypertension after      167
# ℹ 2,102 more rows

Question 6

Top 20 Words by Specialty without Stopwords and Numbers

# Per-specialty word frequencies, after discarding tokens found in the
# `stop_words` lexicon and tokens containing a digit.
words_by_specialty <- mt_samples |>
  unnest_tokens(word, transcription) |>
  anti_join(stop_words, by = "word") |>
  filter(!grepl("[0-9]", word)) |>
  count(medical_specialty, word, sort = TRUE)

# Within each specialty keep the 20 highest counts, then drop the grouping so
# later steps operate on a plain tibble.
tokenized_specialty_counts <- words_by_specialty |>
  group_by(medical_specialty) |>
  slice_max(order_by = n, n = 20) |>
  ungroup()

# One panel per specialty; each panel gets its own y-axis (its own words)
# while the count axis is shared across panels.
ggplot(
  data = tokenized_specialty_counts,
  mapping = aes(x = n, y = word, fill = medical_specialty)
) +
  geom_col(show.legend = TRUE) +
  facet_wrap(~medical_specialty, scales = "free_y") +
  theme_minimal() +
  labs(
    title = "Top 20 Most Frequent Words by Medical Specialty (Without Stopwords)",
    x = "Count",
    y = "Word"
  )

Top 5 Words by Specialty without Stopwords and Numbers

# Per-specialty word frequencies, after discarding tokens found in the
# `stop_words` lexicon and tokens containing a digit.
words_by_specialty2 <- mt_samples |>
  unnest_tokens(word, transcription) |>
  anti_join(stop_words, by = "word") |>
  filter(!grepl("[0-9]", word)) |>
  count(medical_specialty, word, sort = TRUE)

# Within each specialty keep the 5 highest counts, then drop the grouping so
# later steps operate on a plain tibble.
tokenized_specialty_counts2 <- words_by_specialty2 |>
  group_by(medical_specialty) |>
  slice_max(order_by = n, n = 5) |>
  ungroup()

# One panel per specialty; each panel gets its own y-axis (its own words)
# while the count axis is shared across panels.
ggplot(
  data = tokenized_specialty_counts2,
  mapping = aes(x = n, y = word, fill = medical_specialty)
) +
  geom_col(show.legend = TRUE) +
  facet_wrap(~medical_specialty, scales = "free_y") +
  theme_minimal() +
  labs(
    title = "Top 5 Most Frequent Words by Medical Specialty (Without Stopwords)",
    x = "Count",
    y = "Word"
  )